{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "CzXUV7O25sOM"
   },
   "source": [
    "# Manipulation de plusieurs séquences (TensorFlow)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "40CGRmXR5sOP"
   },
   "source": [
    "Installez la bibliothèque 🤗 *Transformers* pour exécuter ce *notebook*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "b2eoLoIp5sOQ"
   },
   "outputs": [],
   "source": [
    "!pip install transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "liMUcXsS5sOR"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"tblard/tf-allocine\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "sequence = \"J'ai attendu un cours d’HuggingFace toute ma vie.\"\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "input_ids = tf.constant(ids)\n",
    "# Cette ligne va échouer\n",
    "model(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "FcY3Ehyz5sOT"
   },
   "outputs": [],
   "source": [
    "tokenized_inputs = tokenizer(sequence, return_tensors=\"tf\")\n",
    "print(tokenized_inputs[\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "r9g_7WTB5sOV"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"tblard/tf-allocine\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "sequence = \"J'ai attendu un cours d’HuggingFace toute ma vie.\"\n",
    "\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "\n",
    "input_ids = tf.constant([ids])\n",
    "print(\"Input IDs:\", input_ids)\n",
    "\n",
    "output = model(input_ids)\n",
    "print(\"Logits:\", output.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_KWu4__95sOX"
   },
   "outputs": [],
   "source": [
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UxkpCEXG5sOY"
   },
   "outputs": [],
   "source": [
    "padding_id = 100\n",
    "\n",
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, padding_id],\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UDqkStbe5sOZ"
   },
   "outputs": [],
   "source": [
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "\n",
    "sequence1_ids = [[200, 200, 200]]\n",
    "sequence2_ids = [[200, 200]]\n",
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, tokenizer.pad_token_id],\n",
    "]\n",
    "\n",
    "print(model(tf.constant(sequence1_ids)).logits)\n",
    "print(model(tf.constant(sequence2_ids)).logits)\n",
    "print(model(tf.constant(batched_ids)).logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "TpBRXe1v5sOb"
   },
   "outputs": [],
   "source": [
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, tokenizer.pad_token_id],\n",
    "]\n",
    "\n",
    "attention_mask = [\n",
    "    [1, 1, 1],\n",
    "    [1, 1, 0],\n",
    "]\n",
    "\n",
    "outputs = model(tf.constant(batched_ids), attention_mask=tf.constant(attention_mask))\n",
    "print(outputs.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "t5UgI12j5sOc"
   },
   "outputs": [],
   "source": [
    "# max_sequence_length = 512\n",
    "equence = sequence[:max_sequence_length]"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
